E
T
P
T
P
E
E
T
P
T
•
k
f
:
R
n
→
{
1
,
.
.
.
,
k
}
f
(
x
)
x
f
•
x
n
2
n
•
f
:
R
n
→
R
•
•
•
•
•
•
x
∈
R
n
x
i
x
•
˜
x
∈
R
n
x
∈
R
n
x
˜
x
P
(
x
|
˜
x
)
•
p
:
R
n
→
R
p
(
x
)
x
x
P
p
(
x
)
x
i
x
−
i
p
(
x
i
|
x
−
i
)
p
(
x
)
P
P
T
E
x
p
(
x
)
x
y
y
x
p
(
y
|
x
)
y
x
∈
R
n
p
(
x
) = Π
n
i
=1
p
(
i
|
1
,
.
.
.
,
i
−
1
)
.
p
(
x
)
n
p
(
y
|
x
)
p
(
x
,
y
)
p
(
y
|
x
) =
p
(
x
,
y
)
y
p
(
x
,
y
)
.
X
∈
R
150
×
4
X
i,
1
i
X
i,
2
i
m
m
{
x
(1)
,
x
(2)
,
.
.
.
,
x
(
m
)
}
x
(
i
)
x
(
j
)
X
y
y
i
i
x
∈
R
n
y
∈
R
ˆ
y
y
ˆ
y
=
w
x
w
∈
R
n
w
i
x
i
w
x
i
w
i
ˆ
y
T
y
x
ˆ
y
=
w
x
P
m
y
X
(
)
y
(
)
ˆ
y
(
)
=
1
m
i
(
ˆ
y
(
)
−
y
(
)
)
2
i
.
ˆ
y
(
)
=
y
(
)
=
1
m
||
ˆ
y
(
)
−
y
(
)
||
2
2
,
w
(
X
(
)
,
y
(
)
)
0
∇
w
= 0
⇒
∇
w
1
m
||
ˆ
y
(
)
−
y
(
)
||
2
2
= 0
⇒
1
m
∇
w
||
X
(
)
w
−
y
(
)
||
2
2
= 0
⇒
∇
w
(
X
(
)
w
−
y
(
)
)
(
X
(
)
w
−
y
(
)
) = 0
⇒
∇
w
(
w
X
(
)
X
(
)
w
−
2
w
X
(
)
y
(
)
+
y
(
)
y
(
)
) = 0
⇒
2
X
(
)
X
(
)
w
−
2
X
(
)
y
(
)
= 0
−1.0
−0.5
0
.0
0.5
1.0
x
1
−1.5
−1.0
−0.5
0.0
0.5
1.0
1.5
2.0
y
Linea
r regre
ssion e
xampl
e
0.5
1.0
1.5
w
1
0.0
0.5
1.0
1.5
2.0
2.5
MSE
(train)
Optimization of
w
w
w
1
w
1
y
=
w
1
x
w
1
⇒
w
= (
X
(
)
X
(
)
)
−
1
X
(
)
y
(
)
b
ˆ
y
=
w
x
+
b
1
m
(
)
||
X
(
)
w
−
y
(
)
||
2
2
,
1
m
(
)
||
X
(
)
w
−
y
(
)
||
2
2
p
(
x
,
y
)
w
ˆ
y
=
b
+
wx.
x
2
x
ˆ
y
=
b
+
w
1
x
+
w
2
x
2
.
x
ˆ
y
=
b
+
9
i
=1
w
i
x
i
.
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Underfit
ting
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Approp
riate mode
l complex
ity
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Overfitt
ing
x
y
X
y
Capacit
y
training
error
generalization
error
Error
generalization gap
optimal
capacit
y
Ov
erfitting zone
Underfitting zone
x
ˆ
y
=
y
i
i
= arg
min
||
X
i,
:
−
x
||
2
2
x
y
x
y
y
p
(
x
,
y
)
10
0
10
1
10
2
10
3
10
4
10
5
# train examples
0
1
2
3
4
5
6
Error (MSE)
Bayes error
Train (quadratic)
Test (quadratic)
Test (optimal capacity)
Train (optimal capacity)
10
0
10
1
10
2
10
3
10
4
10
5
# train examples
0
5
10
15
20
Optimal capacity (polynomial degree)
sin(
x
)
x
J
(
w
)
L
2
J
(
w
) =
+
λ
w
w
,
λ
λ
=
0
λ
J
(
w
)
λ
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Underfitting (excessive
¸
)
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Appropriate weight decay (medium
¸
)
−5
0
5
x
0
−100
−80
−60
−40
−20
0
20
40
y
Overfitting,
¸
!
0
λ
λ
λ
λ
= 0
.
k
k
k
k
k
k
−
1
k
−
1
k
−
1
θ
ˆ
θ
{
x
(1)
,
.
.
.
,
x
(
m
)
}
m
ˆ
θ
m
=
g
(
x
(1)
,
.
.
.
,
x
(
m
)
)
.
g
(
x
(1)
,
.
.
.
,
x
(
m
)
)
θ
ˆ
θ
ˆ
θ
y
x
f
(
x
)
y
x
y
=
f
(
x
)
+
y
x
f
ˆ
f
ˆ
f
ˆ
f
x
y
ˆ
θ
ˆ
f
bias(
ˆ
θ
m
) =
E
(
ˆ
θ
m
)
−
θ
θ
θ
ˆ
θ
m
bias(
ˆ
θ
m
)
=
0
E
(
ˆ
θ
m
)
=
θ
ˆ
θ
m
lim
m
→∞
bias(
ˆ
θ
m
)
=
0
lim
m
→∞
E
(
ˆ
θ
m
) =
θ
{
x
(1)
,
.
.
.
,
x
(
m
)
}
x
(
i
)
∈
{
0
,
1
}
i
∈
[1
,
m
]
P
(
x
(
i
)
;
θ
) =
θ
x
(
i
)
(1
−
θ
)
(1
−
x
(
i
)
)
ˆ
θ
m
=
1
m
m
i
=1
x
(
i
)
bias(
ˆ
θ
m
) =
E
[
ˆ
θ
m
]
−
θ
=
E
1
m
m
i
=1
x
(
i
)
−
θ
=
1
m
m
i
=1
E
x
(
i
)
−
θ
=
1
m
m
i
=1
1
x
(
i
)
=0
x
(
i
)
θ
x
(
i
)
(1
−
θ
)
(1
−
x
(
i
)
)
−
θ
=
1
m
m
i
=1
(
θ
)
−
θ
=
θ
−
θ
= 0
bias(
ˆ
θ
) = 0
ˆ
θ
{
x
(1)
,
.
.
.
,
x
(
m
)
}
x
(
i
)
∼
Gaussian(
µ,
σ
2
)
i
∈
[1
,
m
]
p
(
x
(
i
)
;
µ,
σ
2
) =
1
√
2
π
σ
2
exp
−
1
2
(
x
(
i
)
−
µ
)
2
σ
2
ˆ
µ
m
=
1
m
m
i
=1
x
(
i
)
bias(
ˆ
µ
m
) =
E
[
ˆ
µ
m
]
−
µ
=
E
1
m
m
i
=1
x
(
i
)
−
µ
=
1
m
m
i
=1
E
x
(
i
)
−
µ
=
1
m
m
i
=1
µ
−
µ
=
µ
−
µ
= 0
σ
2
σ
2
ˆ
σ
2
m
=
1
m
m
i
=1
x
(
i
)
−
ˆ
µ
m
2
,
ˆ
µ
m
bias(
ˆ
σ
2
m
) =
E
[
ˆ
σ
2
m
]
−
σ
2
E
[
ˆ
σ
2
m
]
E
[
ˆ
σ
2
m
] =
E
1
m
m
i
=1
x
(
i
)
−
ˆ
µ
m
2
=
E
1
m
m
i
=1
(
x
(
i
)
)
2
−
2
x
(
i
)
ˆ
µ
m
+
ˆ
µ
2
m
=
1
m
m
i
=1
E
(
x
(
i
)
)
2
−
2
E
x
(
i
)
1
m
m
j
=1
x
(
j
)
+
E
1
m
m
j
=1
x
(
j
)
1
m
m
k
=1
x
(
k
)
=
1
m
m
i
=1
1
−
2
m
E
(
x
(
i
)
)
2
−
2
m
j
=
i
E
x
(
i
)
x
(
j
)
+
1
m
2
m
j
=1
E
(
x
(
j
)
)
2
+
1
m
2
m
j
=1
k
=
j
E
x
(
j
)
x
(
k
)
=
1
m
m
i
=1
(
m
−
2
m
)(
µ
2
+
σ
2
)
−
2(
m
−
1)
m
(
µ
2
)
+
1
m
(
µ
2
+
σ
2
)
+
(
m
−
1)
m
(
µ
2
)
=
m
−
1
m
σ
2
ˆ
σ
2
m
−
σ
2
/m
˜
σ
2
m
=
1
m
−
1
m
i
=1
x
(
i
)
−
ˆ
µ
m
2
E
[
˜
σ
2
m
] =
σ
2
E
[
˜
σ
2
m
] =
E
1
m
−
1
m
i
=1
x
(
i
)
−
ˆ
µ
m
2
=
m
m
−
1
E
[
ˆ
σ
2
m
]
=
m
m
−
1
m
−
1
m
σ
2
=
σ
2
.
V
ar(
ˆ
θ
) =
E
[
ˆ
θ
2
]
−
E
[
ˆ
θ
]
2
se
ˆ
θ
SE(
ˆ
θ
) =
V
ar[
ˆ
θ
]
.
SE(
ˆ
µ
m
) =
V
ar[
1
m
m
i
=1
x
(
i
)
] =
σ
√
m
σ
2
x
i
σ
ˆ
µ
m
(
ˆ
µ
m
−
1
.
96SE(
ˆ
µ
m
)
,
ˆ
µ
m
+
1
.
96SE(
ˆ
µ
m
))
ˆ
µ
m
SE(
ˆ
µ
m
)
2
{
x
(1)
,
.
.
.
,
x
(
m
)
}
P
(
x
(
i
)
;
θ
)
=
θ
x
(
i
)
(1
−
θ
)
(1
−
x
(
i
)
)
ˆ
θ
m
=
1
m
m
i
=1
x
(
i
)
V
ar
ˆ
θ
m
= V
ar
1
m
m
i
=1
x
(
i
)
=
1
m
2
m
i
=1
V
ar
x
(
i
)
=
1
m
2
m
i
=1
θ
(1
−
θ
)
=
1
m
2
mθ
(1
−
θ
)
=
1
m
θ
(1
−
θ
)
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
x
(
i
)
∼
Gaussian(
µ,
σ
2
)
i
∈
[1
,
m
]
ˆ
σ
2
m
=
1
m
m
i
=1
x
(1)
−
ˆ
µ
m
2
,
˜
σ
2
m
=
1
m
−
1
m
i
=1
x
(1)
−
ˆ
µ
m
2
.
m
−
1
σ
2
ˆ
σ
2
χ
2
χ
2
m
−
1
2(
m
−
1)
V
ar
m
−
1
σ
2
˜
σ
2
= 2(
m
−
1)
(
m
−
1)
2
σ
4
V
ar
˜
σ
2
= 2(
m
−
1)
V
ar
˜
σ
2
=
2
σ
4
(
m
−
1)
ˆ
σ
2
=
m
−
1
m
˜
σ
2
˜
σ
2
χ
2
V
ar
ˆ
σ
2
=
2(
m
−
1)
σ
4
m
2
V
ar
˜
σ
2
=
m
m
−
1
2
V
ar
ˆ
σ
2
V
ar
˜
σ
2
>
V
ar
ˆ
σ
2
˜
σ
2
ˆ
σ
2
˜
σ
2
MSE =
E
[(
ˆ
θ
n
−
θ
)
2
]
= Bias(
ˆ
θ
n
)
2
+
V
ar(
ˆ
θ
n
)
θ
Capacit
y
bias
generalization
error
v
ariance
optimal
capacit
y
Ov
erfitting zone
Underfitting zone
x
ˆ
σ
2
˜
σ
2
ˆ
σ
2
˜
σ
2
ˆ
σ
2
MSE(
ˆ
σ
2
m
) = Bias(
ˆ
σ
2
m
)
2
+
V
ar(
ˆ
σ
2
m
)
=
−
σ
2
m
2
+
2(
m
−
1)
σ
4
m
2
=
1
+
2(
m
−
1)
m
2
σ
4
=
2
m
−
1
m
2
σ
4
MSE(
˜
σ
2
m
) = Bias(
˜
σ
2
m
)
2
+
V
ar(
˜
σ
2
m
)
= 0
+
2
σ
4
(
m
−
1)
=
2
(
m
−
1)
σ
4
.
˜
σ
2
m
ˆ
σ
2
m
ˆ
σ
2
m
lim
n
→∞
ˆ
θ
n
p
→
θ
µ
N
(
µ,
σ
2
)
n
{
x
1
,
.
.
.
,
x
n
}
p
→
>
0
P
(
|
ˆ
θ
n
−
θ
|
>
)
→
0
n
→
∞
ˆ
θ
θ
x
1
ˆ
θ
=
x
1
E
(
ˆ
θ
n
)
=
θ
ˆ
θ
n
→
θ
n
→
∞
m
X
=
{
x
(1)
,
.
.
.
,
x
(
m
)
}
p
(
x
)
p
(
x
;
θ
)
θ
p
(
x
;
θ
)
x
p
(
x
)
θ
θ
ML
= arg
max
θ
p
(
X
;
θ
)
= arg
max
θ
m
i
=1
p
(
x
(
i
)
;
θ
)
arg
max
arg
max
θ
ML
= arg
max
θ
m
i
=1
log
p
(
x
(
i
)
;
θ
)
.
m
θ
ML
= arg
max
θ
E
x
∼
ˆ
p
log
p
(
x
;
θ
)
.
D
KL
(
ˆ
p
p
) =
E
x
∼
ˆ
p
[log ˆ
p
(
x
)
−
log
p
(
x
)]
.
−
E
x
∼
ˆ
p
[log
p
(
x
)]
ˆ
p
p
θ
x
P
(
y
|
x
;
θ
)
y
x
(
x
,
y
)
X
Y
θ
ML
= arg
max
θ
P
(
Y
|
X
;
θ
)
.
θ
ML
= arg
max
θ
m
i
=1
log
P
(
y
(
i
)
|
x
(
i
)
;
θ
)
.
y
x
=
x
µ
(
x
)
x
σ
2
log
P
(
Y
|
X
;
θ
) =
m
i
=1
log
P
(
y
(
i
)
|
x
(
i
)
;
θ
)
=
m
i
=1
−
1
2
σ
2
||
ˆ
y
(
i
)
−
y
(
i
)
||
2
−
m
log
σ
−
m
2
log(2
π
)
ˆ
y
(
i
)
=
µ
(
x
(
i
)
)
i
x
(
i
)
m
y
σ
M
S
E
=
1
m
m
i
=1
||
ˆ
y
(
i
)
−
y
(
i
)
||
2
.
µ
(
x
)
=
w
·
x
µ
(
x
)
y
x
m
→
∞
m
m
m
m
θ
θ
θ
ˆ
θ
θ
θ
p
(
θ
)
θ
θ
{
x
(1)
,
.
.
.
,
x
(
m
)
}
θ
p
(
x
(1)
,
.
.
.
,
x
(
m
)
|
θ
)
p
(
θ
|
x
(1)
,
.
.
.
,
x
(
m
)
) =
p
(
x
(1)
,
.
.
.
,
x
(
m
)
|
θ
)
p
(
θ
)
p
(
x
(1)
,
.
.
.
,
x
(
m
)
)
θ
p
(
θ
|
x
(1)
,
.
.
.
,
x
(
m
)
)
p
(
θ
)
θ
θ
m
x
(
m
+1)
p
(
x
(
m
+1)
|
x
(1)
,
.
.
.
,
x
(
m
)
) =
p
(
x
(
m
+1)
|
θ
)
p
(
θ
|
x
(1)
,
.
.
.
,
x
(
m
)
)
d
θ
θ
{
x
(1)
,
.
.
.
,
x
(
m
)
}
θ
θ
θ
p
(
θ
)
p
(
x
(1)
,
.
.
.
,
x
(
m
)
|
θ
)
En
trop
y(
)
≤
En
trop
y(
)
θ
θ
x
∈
R
n
y
∈
R
w
∈
R
n
ˆ
y
=
w
x
.
m
(
X
(
)
,
y
(
)
)
y
ˆ
y
(
)
=
X
(
)
w
.
y
(
)
p
(
y
(
)
|
X
(
)
,
w
) =
N
(
y
(
)
;
X
(
)
w
,
I
)
∝
exp
−
1
2
(
y
(
)
−
X
(
)
w
)
(
y
(
)
−
X
(
)
w
)
,
y
(
X
(
)
,
y
(
)
)
(
X
,
y
)
w
θ
p
(
w
) =
N
(
w
;
µ
0
,
Λ
0
)
∝
exp
−
1
2
(
w
−
µ
0
)
Λ
−
1
0
(
w
−
µ
0
)
µ
0
Λ
0
Λ
0
= diag(
λ
0
)
p
(
w
|
X
,
y
)
∝
p
(
y
|
X
,
w
)
p
(
w
)
∝
exp
−
1
2
(
y
−
X
w
)
(
y
−
X
w
)
exp
−
1
2
(
w
−
µ
0
)
Λ
−
1
0
(
w
−
µ
0
)
∝
exp
−
1
2
−
2
y
X
w
+
w
X
X
w
+
w
Λ
−
1
0
w
−
2
µ
0
Λ
−
1
0
w
Λ
m
=
X
X
+
Λ
−
1
0
−
1
µ
m
= Λ
m
X
y
+
Λ
−
1
0
µ
0
p
(
w
|
X
,
y
)
∝
exp
−
1
2
(
w
−
µ
m
)
Λ
−
1
m
(
w
−
µ
m
)
+
1
2
µ
m
Λ
−
1
m
µ
m
∝
exp
−
1
2
(
w
−
µ
m
)
Λ
−
1
m
(
w
−
µ
m
)
.
w
µ
m
Λ
m
w
n
p
(
w
|
X
(
)
,
y
(
)
) =
1
(2
π
)
n
|
Λ
m
|
exp
−
1
2
(
w
−
µ
m
)
Λ
−
1
m
(
w
−
µ
m
)
.
θ
θ
θ
MAP
= arg
max
θ
p
(
θ
|
x
) = arg
max
θ
log
p
(
x
|
θ
)
+
log
p
(
θ
)
log
p
(
x
|
θ
)
log
p
(
θ
)
m
(
X
(
)
,
y
(
)
)
y
ˆ
y
(
)
=
X
(
)
w
.
w
∈
R
n
ˆ
w
ML
= (
X
(
)
X
(
)
)
−
1
X
(
)
y
(
)
Λ
0
=
λ
0
I
µ
0
= 0
x
y
ˆ
w
MAP
= Λ
m
X
(
)
y
(
)
µ
0
Λ
0
Λ
m
Λ
m
=
X
(
)
X
(
)
+
λ
−
1
0
I
−
1
µ
0
µ
0
E
[
ˆ
w
ML
] =
w
V
ar(
w
ML
) = (
X
(
)
X
(
)
)
−
1
E
[
ˆ
w
MAP
] =
E
[Λ
m
X
(
)
y
(
)
]
=
E
Λ
m
X
(
)
X
(
)
w
+
= Λ
m
X
(
)
X
(
)
w
+
Λ
m
X
(
)
E
[
]
=
X
(
)
X
(
)
+
λ
−
1
0
I
−
1
X
(
)
X
(
)
w
,
w
w
µ
Bias(
ˆ
w
MAP
) =
E
[
ˆ
w
MAP
]
−
w
=
−
λ
0
X
(
)
X
(
)
+
I
−
1
w
.
λ
0
→
∞
λ
0
→
0
w
V
ar(
ˆ
θ
) =
E
ˆ
θ
2
−
E
ˆ
θ
2
E
ˆ
w
MAP
ˆ
w
M
AP
E
ˆ
w
MAP
ˆ
w
M
AP
=
E
Λ
m
X
(
)
ˆ
y
(
)
y
(
)
X
(
)
Λ
m
=
E
Λ
m
X
(
)
X
(
)
w
+
X
(
)
w
+
X
(
)
Λ
m
= Λ
m
X
(
)
X
(
)
w
w
X
(
)
X
(
)
Λ
m
+
Λ
m
X
(
)
E
X
(
)
Λ
m
= Λ
m
X
(
)
X
(
)
w
w
X
(
)
X
(
)
Λ
m
+
Λ
m
X
(
)
X
(
)
Λ
m
=
E
[
ˆ
w
MAP
]
E
[
ˆ
w
MAP
]
+
X
(
)
X
(
)
Λ
m
E
ˆ
w
MAP
ˆ
w
M
AP
V
ar(
ˆ
w
MAP
) =
E
ˆ
w
M
A
P
ˆ
w
M
AP
−
E
[
ˆ
w
M
A
P
]
E
ˆ
w
M
AP
=
E
[
ˆ
w
MAP
]
E
[
ˆ
w
MAP
]
+
Λ
m
X
(
)
X
(
)
Λ
m
−
E
[
ˆ
w
MAP
]
E
[
ˆ
w
MAP
]
= Λ
m
X
(
)
X
(
)
Λ
m
=
X
(
)
X
(
)
+
λ
−
1
0
I
−
1
X
(
)
X
(
)
×
X
(
)
X
(
)
+
λ
−
1
0
I
−
1
w
x
λ
0
V
ar(
ˆ
w
ML
) =
1
m
i
=1
x
2
i
>
V
ar(
ˆ
w
MAP
) =
λ
0
m
i
=1
x
2
i
(
1+
λ
0
m
i
=1
x
2
i
)
2
log
p
(
θ
)
x
y
y
p
(
y
|
x
)
θ
p
(
y
|
x
;
θ
)
p
(
y
|
x
;
θ
)
=
N
(
y
|
θ
x
,
I
)
p
(
y
= 1
|
x
;
θ
) =
σ
(
θ
x
)
.
w
x
+
b
w
x
+
b
=
b
+
m
i
=1
α
i
x
x
(
i
)
x
(
i
)
α
x
φ
(
x
)
k
(
x
,
x
(
i
)
) =
φ
(
x
)
φ
(
x
(
i
)
)
f
(
x
) =
b
+
i
α
i
k
(
x
,
x
(
i
)
)
.
φ
x
x
φ
α
k
φ
φ
x
φ
(
x
)
x
k
(
x,
x
(
i
)
) = min(
x,
x
(
i
)
)
k
(
u
,
v
) =
N
(
u
−
v
;
0
,
σ
2
I
)
N
(
x
;
µ
,
Σ
)
k
(
u
,
v
)
=
φ
(
x
)
φ
(
x
)
φ
α
i
α
i
k
(
x
,
x
(
i
)
)
α
α
i
k
y
x
k
x
X
y
y
c
c
y
=
1
c
i
=
0
i
k
k
k
x
∈
R
100
x
1
y
=
x
1
x
x
2
x
1
00
x
1
0
1
01
11
111
0
1
011
1111
1110
110
10
010
00
1110
1111
110
10
01
00
010
011
11
111
x
2
>
x
1
k
x
x
x
X
n
×
m
X
E
[
x
] =
0
X
V
ar[
x
] =
1
n
−
1
X
X
z
=
W
x
V
ar[
z
]
X
X
=
U
Σ
W
Σ
n
×
m
X
U
n
×
n
W
m
×
m
X
X
V
ar[
x
] =
1
n
−
1
X
X
=
1
n
−
1
(
U
Σ
W
)
U
Σ
W
=
1
n
−
1
W
Σ
U
U
Σ
W
=
1
n
−
1
W
Σ
2
W
,
U
U
U
=
I
Σ
2
m
×
m
X
i
Σ
2
i,i
z
=
W
x
z
V
ar[
z
] =
1
n
−
1
Z
Z
=
1
n
−
1
W
X
X
W
=
1
n
−
1
W
W
Σ
2
W
W
=
1
n
−
1
Σ
2
X
W
W
W
=
I
X
z
X
W
n
×
n
X
X
x
z
W
Σ
2
z
W
z
(
x
,
y
)
y
X
y
J
(
w
,
b
) =
−
E
x
,y
∼
ˆ
p
p
(
y
|
x
)
,
p
(
y
|
x
)
=
N
(
y
|
x
w
+
b,
1)
J
J
(
θ
) =
E
x
,y
∼
ˆ
p
L
(
x
,
y
,
θ
) =
1
m
m
i
=1
L
(
x
(
i
)
,
y
(
i
)
,
θ
)
y
L
(
x
,
y
,
θ
)
=
log
p
(
y
|
x
;
θ
)
.
J
(
w
,
b
) =
λ
||
w
||
2
2
−
E
x
,y
∼
p
p
(
y
|
x
)
.
L
θ
←
θ
−
∂
L
(
x
(
i
)
,
y
(
i
)
,
θ
)
∂
θ
i
X
J
(
w
) =
E
x
∼
p
||
x
−
r
(
x
;
w
)
||
2
2
w
r
(
x
) =
w
w
x
.
k
×
10
3
= 1000
d
V
O
(
V
d
)
x
x
f
∗
f
∗
(
x
)
≈
f
∗
(
x
+
)
x
x
x
x
k
R
x
k
k
=
1
x
f
f
(
x
) =
b
+
n
i
=1
α
i
k
(
x
,
x
(
i
)
)
n
j
=1
k
(
x
,
x
(
j
)
)
.
k
f
k
k
x
(
i
)
k
α
i
1
/n
y
(
i
)
(
x
(
i
)
,
y
(
i
)
)
N
N
N
k
(
u
,
v
)
u
=
v
f
∗
(
x
)
≈
f
∗
(
x
+
)
(
x
,
y
)
(
x
(
i
)
,
y
(
i
)
)
f
∗
(
x
(
i
)
)
≈
y
(
i
)
x
x
(
i
)
f
∗
(
x
)
≈
y
(
i
)
O
(
N
)
O
(
N
)
O
(
N
)
O
(
N
)
O
(2
N
)
O
(
N
)
n
n
R
n
×
2
100